#!/usr/bin/env Rscript
"> methepic_emseq_wgbs_per-pos_beta.R <
Reads in betas from MethylationEPIC arrays, EM-seq and WGBS, and compares them
on a per-position basis.
For short read tech (EM-seq/WGBS), coverage cutoffs are defined as having
at least 3 of 4 samples from the same method having a coverage of 5 and
above (i.e., EM-seq 3 of 4 with coverage >= 5 && WGBS 3 of 4 with coverage >= 5).
MethylationEPIC readouts are 'analogue' so no 'coverage' to deal with.
" -> doc
setwd('~/csiro/stopwatch/cpgberus/14_methepic_vs_emseq_wgbs/')
# colour codes are from Dark2 panel
EMSEQ_COLOR <- '#1b9e77'
EPIC_COLOR <- '#e7298a'
WGBS_COLOR <- '#7570b3'
suppressPackageStartupMessages({
library(Biostrings)
library(BSgenome.Hsapiens.UCSC.hg38)
library(cowplot)
library(ggplot2)
library(RColorBrewer)
})
# function to pretty-print diagnostic messages
diag_message <- function(...) {
message('[', format(Sys.time(), "%H:%M:%S"), '] ', ...)
}
# function to annotate line of best fit in scatterplot
lm_eqn <- function(df, x, y) {
# modified from https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
model <- lm(as.formula(paste(y, '~', x)), df)
eq <- substitute(
italic(y) == c + m %.% italic(x)*','~italic(r) == rval*','~italic(p) == pval*','~italic(n) == n_df,
list(c = format(unname(coef(model)[1]), digits=3),
m = format(unname(coef(model)[2]), digits=3),
rval = format(cor(df[[x]], df[[y]]), digits=4),
pval = format(summary(model)$coefficients[2,4], digits=1),
n_df = format(nrow(df), big.mark=',')))
as.character(as.expression(eq))
}
# function to calculate sum/len of a binary string
pct_of_ones <- function(binary_string) {
int_vector <- as.integer(unlist(strsplit(binary_string, split = "")))
sum(int_vector) / length(int_vector) * 100
}
# load processed EPIC data
load('epic_betas_hg38.RData')
# fix EPIC GRanges--all of the current positions refer to the 'C' on the Watson
# strand. this is correct when strand is '+', but not correct when strand is
# '-' (methylated C on Crick would be the G on Watson)
head(epic_gr, 10)
## GRanges object with 10 ranges and 4 metadata columns:
## seqnames ranges strand | WR025V1I WR025V9I WR069V1I
## <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric>
## cg18478105 chr20 63216298 + | 0.0173053 0.0199615 0.0217486
## cg09835024 chrX 24054523 + | 0.0326714 0.0517594 0.0346174
## cg14361672 chr9 128701657 - | 0.8865378 0.8594833 0.8773512
## cg01763666 chr17 82201630 - | 0.8258608 0.8342132 0.7833352
## cg12950382 chr14 104710399 - | 0.9221174 0.9384120 0.9030496
## cg02115394 chr13 114234693 - | 0.0870580 0.0858822 0.0855477
## cg25813447 chrX 38801258 + | 0.4203698 0.3365862 0.3448609
## cg07779434 chrX 14873227 + | 0.3743789 0.3703045 0.3686351
## cg13417420 chr12 12696225 + | 0.0308294 0.0230513 0.0306978
## cg12480843 chr8 73879050 + | 0.0257384 0.0236206 0.0288161
## WR069V9I
## <numeric>
## cg18478105 0.0168954
## cg09835024 0.0512708
## cg14361672 0.8487040
## cg01763666 0.8522306
## cg12950382 0.9232085
## cg02115394 0.0777868
## cg25813447 0.3797780
## cg07779434 0.3673453
## cg13417420 0.0264349
## cg12480843 0.0262686
## -------
## seqinfo: 25 sequences (1 circular) from hg38 genome
getSeq(BSgenome.Hsapiens.UCSC.hg38, head(epic_gr, 10))
## DNAStringSet object of length 10:
## width seq names
## [1] 1 C cg18478105
## [2] 1 C cg09835024
## [3] 1 G cg14361672
## [4] 1 G cg01763666
## [5] 1 G cg12950382
## [6] 1 G cg02115394
## [7] 1 C cg25813447
## [8] 1 C cg07779434
## [9] 1 C cg13417420
## [10] 1 C cg12480843
ranges(epic_gr)[strand(epic_gr) == '-'] <- shift(ranges(epic_gr)[strand(epic_gr) == '-'], 1)
head(epic_gr, 10)
## GRanges object with 10 ranges and 4 metadata columns:
## seqnames ranges strand | WR025V1I WR025V9I WR069V1I
## <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric>
## cg18478105 chr20 63216298 + | 0.0173053 0.0199615 0.0217486
## cg09835024 chrX 24054523 + | 0.0326714 0.0517594 0.0346174
## cg14361672 chr9 128701658 - | 0.8865378 0.8594833 0.8773512
## cg01763666 chr17 82201631 - | 0.8258608 0.8342132 0.7833352
## cg12950382 chr14 104710400 - | 0.9221174 0.9384120 0.9030496
## cg02115394 chr13 114234694 - | 0.0870580 0.0858822 0.0855477
## cg25813447 chrX 38801258 + | 0.4203698 0.3365862 0.3448609
## cg07779434 chrX 14873227 + | 0.3743789 0.3703045 0.3686351
## cg13417420 chr12 12696225 + | 0.0308294 0.0230513 0.0306978
## cg12480843 chr8 73879050 + | 0.0257384 0.0236206 0.0288161
## WR069V9I
## <numeric>
## cg18478105 0.0168954
## cg09835024 0.0512708
## cg14361672 0.8487040
## cg01763666 0.8522306
## cg12950382 0.9232085
## cg02115394 0.0777868
## cg25813447 0.3797780
## cg07779434 0.3673453
## cg13417420 0.0264349
## cg12480843 0.0262686
## -------
## seqinfo: 25 sequences (1 circular) from hg38 genome
getSeq(BSgenome.Hsapiens.UCSC.hg38, head(epic_gr, 10))
## DNAStringSet object of length 10:
## width seq names
## [1] 1 C cg18478105
## [2] 1 C cg09835024
## [3] 1 C cg14361672
## [4] 1 C cg01763666
## [5] 1 C cg12950382
## [6] 1 C cg02115394
## [7] 1 C cg25813447
## [8] 1 C cg07779434
## [9] 1 C cg13417420
## [10] 1 C cg12480843
# load per-position data from EM-seq and WGBS
cov_df <- read.delim('../04_parse_bismark_covs/rarefied.coverages.tsv.gz')
beta_df <- read.delim('../04_parse_bismark_covs/rarefied.methpcts.tsv.gz')
# convert meth % values (0-100) in `beta_df` to betas (0-1)
beta_df[4:ncol(beta_df)] <- beta_df[4:ncol(beta_df)] / 100
head(beta_df)
## scaffold start_pos end_pos WR025V1ER WR025V1WR WR025V9ER WR025V9WR WR069V1ER
## 1 chr1 10469 10469 NaN NaN NaN NaN NaN
## 2 chr1 10470 10470 NaN NaN NaN NaN NaN
## 3 chr1 10471 10471 NaN NaN NaN NaN NaN
## 4 chr1 10472 10472 NaN NaN NaN NaN NaN
## 5 chr1 10484 10484 NaN NaN NaN NaN NaN
## 6 chr1 10485 10485 NaN NaN NaN NaN NaN
## WR069V1WR WR069V9ER WR069V9WR
## 1 0.33333 1 1.0
## 2 NaN NaN 1.0
## 3 0.66667 1 0.5
## 4 NaN NaN 1.0
## 5 0.66667 1 1.0
## 6 1.00000 NaN 1.0
# filtering `beta_df` for positions with min 5 coverage in ALL 8 rarefied
# samples was too stringent: initial union of all 55m positions (covered at
# least once in at least one sample) drops to < 1m post-filtering.
#
# requiring min 3 of 4 EM-seq samples && min 3 of 4 WGBS samples produces a
# more workable set of positions, ~7.7m.
#
# get numbers that summarises coverages across EM-seq and WGBS samples
diag_message('Number of positions with min 5 coverage in 4/4 EM-seq samples: ',
sum(rowSums(cov_df[, grepl('ER$', colnames(cov_df))] >= 5) == 4))
## [16:43:38] Number of positions with min 5 coverage in 4/4 EM-seq samples: 9568385
diag_message('Number of positions with min 5 coverage in 4/4 WGBS samples: ',
sum(rowSums(cov_df[, grepl('WR$', colnames(cov_df))] >= 5) == 4))
## [16:43:40] Number of positions with min 5 coverage in 4/4 WGBS samples: 3274673
diag_message('Number of positions with min 5 coverage in 3/4 EM-seq samples: ',
sum(rowSums(cov_df[, grepl('ER$', colnames(cov_df))] >= 5) >= 3))
## [16:43:43] Number of positions with min 5 coverage in 3/4 EM-seq samples: 27615442
diag_message('Number of positions with min 5 coverage in 3/4 EM-seq samples: ',
sum(rowSums(cov_df[, grepl('WR$', colnames(cov_df))] >= 5) >= 3))
## [16:43:47] Number of positions with min 5 coverage in 3/4 EM-seq samples: 12354217
# ... WGBS not performing too hot here, tsk tsk
# note: for these lines to work, positions in `beta_df` and `cov_df` must have
# the same order (guaranteed by upstream Python script). else `beta_df`
# will not be sliced properly
diag_message('Filter `beta_df` for positions where 3/4 EM-seq samples AND 3/4 WGBS samples have coverage >= 5. ')
## [16:43:51] Filter `beta_df` for positions where 3/4 EM-seq samples AND 3/4 WGBS samples have coverage >= 5.
diag_message('Original nrow: ', nrow(beta_df), '; ')
## [16:43:51] Original nrow: 57672689;
beta_df <- beta_df[rowSums(cov_df[grepl('ER$', colnames(cov_df))] >= 5) >= 3 &
rowSums(cov_df[grepl('WR$', colnames(cov_df))] >= 5) >= 3, ]
cov_df <- cov_df[rowSums(cov_df[grepl('ER$', colnames(cov_df))] >= 5) >= 3 &
rowSums(cov_df[grepl('WR$', colnames(cov_df))] >= 5) >= 3, ]
diag_message('filtered nrow: ', nrow(beta_df))
## [16:44:05] filtered nrow: 7744836
diag_message('Mean coverage of remaining positions are: ',
'EM-seq ', mean(as.matrix(cov_df[grepl('ER$', colnames(cov_df))])), '; ',
'WGBS ', mean(as.matrix(cov_df[grepl('WR$', colnames(cov_df))])))
## [16:44:05] Mean coverage of remaining positions are: EM-seq 7.59187689707051; WGBS 6.70286546416219
# again, WGBS not performing well in terms of coverage
# convert positions from `beta_df` into a GRanges object, so that intersecting
# EPIC data (already in GRanges object) is easy
beta_gr <- GRanges(paste(beta_df$scaffold, beta_df$start_pos, sep=':'))
mcols(beta_gr) <- beta_df[4:ncol(beta_df)]
head(beta_gr)
## GRanges object with 6 ranges and 8 metadata columns:
## seqnames ranges strand | WR025V1ER WR025V1WR WR025V9ER WR025V9WR
## <Rle> <IRanges> <Rle> | <numeric> <numeric> <numeric> <numeric>
## [1] chr1 16244 * | 0.80000 0.77778 0.85714 0.6
## [2] chr1 17407 * | 0.88889 1.00000 1.00000 1.0
## [3] chr1 17452 * | 0.94118 1.00000 1.00000 1.0
## [4] chr1 17453 * | 0.90000 1.00000 1.00000 1.0
## [5] chr1 17478 * | 1.00000 1.00000 1.00000 1.0
## [6] chr1 17479 * | 0.92857 1.00000 0.92308 1.0
## WR069V1ER WR069V1WR WR069V9ER WR069V9WR
## <numeric> <numeric> <numeric> <numeric>
## [1] 0.71429 0.83333 0.50000 0.77778
## [2] 1.00000 1.00000 1.00000 0.83333
## [3] 1.00000 1.00000 1.00000 1.00000
## [4] 1.00000 1.00000 1.00000 1.00000
## [5] 1.00000 0.92308 1.00000 1.00000
## [6] 0.87500 1.00000 0.85714 1.00000
## -------
## seqinfo: 25 sequences from an unspecified genome; no seqlengths
# aggressively subset both GRanges to the intersection of both
diag_message('# positions in `epic_gr` before intersection: ', length(epic_gr))
## [16:44:29] # positions in `epic_gr` before intersection: 864755
epic_gr <- subsetByOverlaps(epic_gr, beta_gr, type='equal', ignore.strand=TRUE)
beta_gr <- subsetByOverlaps(beta_gr, epic_gr, type='equal', ignore.strand=TRUE)
epic_gr <- sort(sortSeqlevels(epic_gr), ignore.strand=TRUE)
beta_gr <- sort(sortSeqlevels(beta_gr), ignore.strand=TRUE)
diag_message('# positions in `epic_gr` after intersection: ', length(epic_gr))
## [16:44:30] # positions in `epic_gr` after intersection: 103670
# sanity check after subsetting: lengths are identical, sequence names are in
# same order, as well as start/end values
stopifnot(length(epic_gr) == length(beta_gr))
stopifnot(identical(seqnames(epic_gr), seqnames(beta_gr)))
stopifnot(identical(start(epic_gr), start(beta_gr)))
stopifnot(identical(end(epic_gr), end(beta_gr)))
# this allows for the cbinding of metadata together. store the metadata
# in `epic_gr`, which is better annotated than `beta_gr`
values(epic_gr) <- cbind(values(beta_gr), values(epic_gr))
# transform the metadata into a proper data.frame (not the 'DataFrame' S4 object
# used in GRanges), then calculate per-method mean methylation levels
wide_df <- as.data.frame(values(epic_gr))
wide_df$meanER <- rowMeans(wide_df[, grepl('ER$', colnames(wide_df))], na.rm=TRUE)
wide_df$meanWR <- rowMeans(wide_df[, grepl('WR$', colnames(wide_df))], na.rm=TRUE)
wide_df$meanI <- rowMeans(wide_df[, grepl('I$', colnames(wide_df))])
head(wide_df)
## WR025V1ER WR025V1WR WR025V9ER WR025V9WR WR069V1ER WR069V1WR
## cg24335620 1.00000 0.85714 1.00000 0.80000 1.00000 1.00000
## cg02288058 1.00000 0.80000 0.85000 0.85714 0.05882 0.06154
## cg11330075 0.71429 1.00000 0.80000 0.70000 0.83333 0.60000
## cg01068023 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## cg05440980 1.00000 1.00000 0.66667 1.00000 0.90000 0.50000
## cg25838738 1.00000 1.00000 1.00000 0.80000 1.00000 1.00000
## WR069V9ER WR069V9WR WR025V1I WR025V9I WR069V1I WR069V9I
## cg24335620 0.92308 1.00000 0.7768391 0.7299938 0.7739380 0.7501038
## cg02288058 0.03774 0.02174 0.3777661 0.2932215 0.3655422 0.3035048
## cg11330075 0.40000 1.00000 0.1661267 0.1625537 0.1878874 0.1681365
## cg01068023 1.00000 1.00000 0.8365245 0.8375699 0.8347850 0.8662742
## cg05440980 1.00000 1.00000 0.9088851 0.8995479 0.8970510 0.9004642
## cg25838738 1.00000 1.00000 0.8704974 0.8625925 0.8639985 0.8813814
## meanER meanWR meanI
## cg24335620 0.9807700 0.914285 0.7577187
## cg02288058 0.4866400 0.435105 0.3350087
## cg11330075 0.6869050 0.825000 0.1711761
## cg01068023 1.0000000 1.000000 0.8437884
## cg05440980 0.8916675 0.875000 0.9014871
## cg25838738 1.0000000 0.950000 0.8696174
# plot a PCA to visualise overall profile of datasets. note that PCA doesn't
# like dealing with NA, necessitating the use of complete.cases() to remove
# any rows with NA in them
set.seed(1337)
pca <- prcomp(t(wide_df[complete.cases(wide_df), ]), scale=TRUE)
pca_coords <- as.data.frame(pca$x)
eigs <- pca$sdev ^ 2
pca_df <- data.frame(PC1=pca_coords$PC1,
PC2=pca_coords$PC2,
PC3=pca_coords$PC3,
row.names=rownames(pca_coords))
pca_df$method <- rownames(pca_coords)
pca_df$method <- gsub('^.*ER$', 'EM-seq', pca_df$method)
pca_df$method <- gsub('^.*WR$', 'WGBS', pca_df$method)
pca_df$method <- gsub('^.*I$', 'EPIC', pca_df$method)
pca_df$sample <- gsub('ER$|WR$|I$', '', rownames(pca_df))
g1 <- ggplot(pca_df, aes(x=PC1, y=PC2, color=method, fill=method, shape=sample)) +
geom_point(size=3, alpha=0.5) +
scale_color_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
scale_fill_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
scale_shape_manual(values=21:25) +
xlab(paste0('PC1 (', round(eigs[1] / sum(eigs) * 100, 2), '%)')) +
ylab(paste0('PC2 (', round(eigs[2] / sum(eigs) * 100, 2), '%)')) +
theme_minimal(12) +
theme(legend.position='top', legend.box='vertical', legend.margin=margin())
g2 <- ggplot(pca_df, aes(x=PC2, y=PC3, color=method, fill=method, shape=sample)) +
geom_point(size=3, alpha=0.5) +
scale_color_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
scale_fill_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
scale_shape_manual(values=21:25) +
xlab(paste0('PC2 (', round(eigs[2] / sum(eigs) * 100, 2), '%)')) +
ylab(paste0('PC3 (', round(eigs[3] / sum(eigs) * 100, 2), '%)')) +
theme_minimal(12) +
theme(legend.position='none', legend.box='vertical', legend.margin=margin())
plot_grid(g1, g2, ncol=1, align='v', rel_heights=c(0.55, 0.45))

ggsave('three-way.beta.pca.pdf', width=6, height=6)
# PCA indicates variation is largest across methods (specifically EPIC vs.
# the other two short read methods). There is some variation across samples,
# seen in PC3 (approx PC2 in terms of % variation explained). In PC3, the
# four-sided shapes are on top (WR025) while the three-sided ones are bottom
# (WR069). However, most variation still originate from choice of method,
# hence the calculation of per-method means; subsequent plots are based off
# these mean values
# plot EM-seq vs. WGBS
ggplot(wide_df, aes(x=meanER, y=meanWR)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
ggtitle('Per-position beta values, EM-seq vs. WGBS') +
xlab('EM-seq') +
ylab('WGBS') +
theme_minimal(12)

# plot EM-seq vs. EPIC
ggplot(wide_df, aes(x=meanER, y=meanI)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanI'), parse=TRUE) +
ggtitle('Per-position beta values, EM-seq vs. EPIC') +
xlab('EM-seq') +
ylab('EPIC') +
theme_minimal(12)

# WGBS vs. EPIC
ggplot(wide_df, aes(x=meanWR, y=meanI)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanWR', 'meanI'), parse=TRUE) +
ggtitle('Per-position beta values, WGBS vs. EPIC') +
xlab('WGBS') +
ylab('EPIC') +
theme_minimal(12)

# does GC% influence the discrepancies in beta? re-do scatterplot, but colour
# points by GC%
#
# calculate GC% in a window of WINDOW_BP
WINDOW_BP <- 101 # this value should be odd, so that the base at midpoint
# is sandwiched by equal num of bases upstream and downstream
bp_per_side <- (WINDOW_BP - 1) / 2
# letterFrequency(CG) / letterfrequency(ACGT) deals with edge cases where windows
# contain N or non-ACGT bases, which get ignored
window_seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, epic_gr + bp_per_side)
wide_df$gcpct <- as.vector(
letterFrequency(window_seqs, 'CG') / letterFrequency(window_seqs, 'ACGT') * 100)
head(wide_df)
## WR025V1ER WR025V1WR WR025V9ER WR025V9WR WR069V1ER WR069V1WR
## cg24335620 1.00000 0.85714 1.00000 0.80000 1.00000 1.00000
## cg02288058 1.00000 0.80000 0.85000 0.85714 0.05882 0.06154
## cg11330075 0.71429 1.00000 0.80000 0.70000 0.83333 0.60000
## cg01068023 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
## cg05440980 1.00000 1.00000 0.66667 1.00000 0.90000 0.50000
## cg25838738 1.00000 1.00000 1.00000 0.80000 1.00000 1.00000
## WR069V9ER WR069V9WR WR025V1I WR025V9I WR069V1I WR069V9I
## cg24335620 0.92308 1.00000 0.7768391 0.7299938 0.7739380 0.7501038
## cg02288058 0.03774 0.02174 0.3777661 0.2932215 0.3655422 0.3035048
## cg11330075 0.40000 1.00000 0.1661267 0.1625537 0.1878874 0.1681365
## cg01068023 1.00000 1.00000 0.8365245 0.8375699 0.8347850 0.8662742
## cg05440980 1.00000 1.00000 0.9088851 0.8995479 0.8970510 0.9004642
## cg25838738 1.00000 1.00000 0.8704974 0.8625925 0.8639985 0.8813814
## meanER meanWR meanI gcpct
## cg24335620 0.9807700 0.914285 0.7577187 64.35644
## cg02288058 0.4866400 0.435105 0.3350087 32.67327
## cg11330075 0.6869050 0.825000 0.1711761 49.50495
## cg01068023 1.0000000 1.000000 0.8437884 49.50495
## cg05440980 0.8916675 0.875000 0.9014871 46.53465
## cg25838738 1.0000000 0.950000 0.8696174 40.59406
# check distribution of GC% values, to select a colour scheme for the heatmap
# centered appropriately over the median
quantile(wide_df$gcpct, c(0.05, .1, .5, .9, .95))
## 5% 10% 50% 90% 95%
## 32.67327 34.65347 46.53465 60.39604 63.36634
# hmm. human genome GC% is 42%, but perhaps EPIC probes are preferentially
# chosen around CpG islands, bumping the GC% up slightly. choose a [30, 70]
# colour scale for subsequent plots
#
# plot EM-seq vs. WGBS and colour points by GC%
g3 <- ggplot(wide_df, aes(x=meanER, y=meanWR, color=gcpct)) +
geom_point(size=0.5, alpha=0.2) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
ggtitle('Per-position beta values') +
xlab('EM-seq') +
ylab('WGBS') +
theme_minimal(12) +
theme(legend.position=c(0.75, 0.1))
# plot EM-seq vs. EPIC and colour points by GC%
g4 <- ggplot(wide_df, aes(x=meanER, y=meanI, color=gcpct)) +
geom_point(size=0.5, alpha=0.2) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanER', 'meanI'), parse=TRUE) +
ggtitle('') +
xlab('EM-seq') +
ylab('EPIC') +
theme_minimal() +
theme(legend.position=c(0.75, 0.1))
# plot WGBS vs. EPIC and colour points by GC%
g5 <- ggplot(wide_df, aes(x=meanWR, y=meanI, color=gcpct)) +
geom_point(size=0.5, alpha=0.2) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'meanWR', 'meanI'), parse=TRUE) +
ggtitle('') +
xlab('WGBS') +
ylab('EPIC') +
theme_minimal() +
theme(legend.position=c(0.75, 0.1))
# do, like, proper stats
wide_df$delta_wgbs_emseq <- wide_df$meanWR - wide_df$meanER
summary(lm(wide_df$delta_wgbs_emseq ~ wide_df$gcpct))
##
## Call:
## lm(formula = wide_df$delta_wgbs_emseq ~ wide_df$gcpct)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62561 -0.04527 0.00354 0.04517 0.53147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.373e-03 1.437e-03 -2.347 0.0189 *
## wide_df$gcpct -3.291e-06 2.998e-05 -0.110 0.9126
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09231 on 103668 degrees of freedom
## Multiple R-squared: 1.162e-07, Adjusted R-squared: -9.53e-06
## F-statistic: 0.01204 on 1 and 103668 DF, p-value: 0.9126
g6 <- ggplot(wide_df, aes(x=gcpct, y=delta_wgbs_emseq)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
coord_cartesian(xlim=c(20, 80), ylim=c(-0.5, 0.5)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'gcpct', 'delta_wgbs_emseq'), parse=TRUE) +
ggtitle('Per-position residual beta values vs. GC%') +
xlab('GC%') +
ylab('Residual WGBS - EM-seq') +
theme_minimal(12)
wide_df$delta_ont_emseq <- wide_df$meanI - wide_df$meanER
g7 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_emseq)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
coord_cartesian(xlim=c(20, 80), ylim=c(-0.5, 0.5)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'gcpct', 'delta_ont_emseq'), parse=TRUE) +
ggtitle('') +
xlab('GC%') +
ylab('Residual EPIC - EM-seq') +
theme_minimal(12)
wide_df$delta_ont_wgbs <- wide_df$meanI - wide_df$meanWR
g8 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_wgbs)) +
geom_point(size=0.5, alpha=0.05) +
geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
coord_cartesian(xlim=c(20, 80), ylim=c(-0.5, 0.5)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df, 'gcpct', 'delta_ont_wgbs'), parse=TRUE) +
ggtitle('') +
xlab('GC%') +
ylab('Residual EPIC - WGBS') +
theme_minimal(12)
plot_grid(g3, g6, g4, g7, g5, g8, ncol=2, rel_heights=c(1, 1, 1),
labels=c('A', 'B', 'C', 'D', 'E', 'F'))
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

ggsave('three-way.beta_and_gc.pdf', width=10, height=12)
# sanity check: confirm that scatterplots, whilst slightly overplotted, are
# visually accurate--replots of these with fewer points (10k) should show
# similar trends
set.seed(42)
wide_df_10k <- wide_df[sample(nrow(wide_df), 10000), ]
g3_10k <- ggplot(wide_df_10k, aes(x=meanER, y=meanWR, color=gcpct)) +
geom_point(size=1, alpha=0.2) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df_10k, 'meanER', 'meanWR'), parse=TRUE) +
ggtitle('Per-position methylation levels') +
xlab('EM-seq (%)') +
ylab('WGBS (%)') +
theme_minimal(12) +
theme(legend.position=c(0.75, 0.1))
# plot EM-seq vs. EPIC and colour points by GC%
g4_10k <- ggplot(wide_df_10k, aes(x=meanER, y=meanI, color=gcpct)) +
geom_point(size=1, alpha=0.2) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df_10k, 'meanER', 'meanI'), parse=TRUE) +
ggtitle('') +
xlab('EM-seq (%)') +
ylab('EPIC (%)') +
theme_minimal() +
theme(legend.position=c(0.75, 0.1))
# plot WGBS vs. EPIC and colour points by GC%
g5_10k <- ggplot(wide_df_10k, aes(x=meanWR, y=meanI, color=gcpct)) +
geom_point(size=1, alpha=0.2) +
scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
guide=guide_colorbar(direction='horizontal')) +
geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
coord_cartesian(xlim=c(0, 1), ylim=c(0, 1)) +
annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
label=lm_eqn(wide_df_10k, 'meanWR', 'meanI'), parse=TRUE) +
ggtitle('') +
xlab('WGBS (%)') +
ylab('EPIC (%)') +
theme_minimal() +
theme(legend.position=c(0.75, 0.1))
plot_grid(g3, g3_10k, g4, g4_10k, g5, g5_10k, ncol=2, rel_heights=c(1, 1, 1),
labels=c('A', 'B', 'C', 'D', 'E', 'F'))
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?
## The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical variable into a factor?

# ... uh, looks ok to me?
# hmm. there's some interesting trends in the high GC% region. how many positions
# have high GC% in its immediate context?
diag_message('Positions with GC > 70% in immediate context: ', sum(wide_df$gcpct > 70))
## [16:45:11] Positions with GC > 70% in immediate context: 1226
diag_message('Positions with GC > 75% in immediate context: ', sum(wide_df$gcpct > 75))
## [16:45:11] Positions with GC > 75% in immediate context: 235
# don't think analyses using these collection of points would be convincing...
# list deps used in this script
sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux bookworm/sid
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blis-openmp/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] RColorBrewer_1.1-3 ggplot2_3.4.0
## [3] cowplot_1.1.1 BSgenome.Hsapiens.UCSC.hg38_1.4.4
## [5] BSgenome_1.64.0 rtracklayer_1.56.1
## [7] GenomicRanges_1.48.0 Biostrings_2.64.1
## [9] GenomeInfoDb_1.32.4 XVector_0.36.0
## [11] IRanges_2.30.1 S4Vectors_0.34.0
## [13] BiocGenerics_0.42.0
##
## loaded via a namespace (and not attached):
## [1] MatrixGenerics_1.8.1 Biobase_2.56.0
## [3] sass_0.4.4 jsonlite_1.8.3
## [5] splines_4.2.2 bslib_0.4.1
## [7] assertthat_0.2.1 highr_0.9
## [9] GenomeInfoDbData_1.2.8 Rsamtools_2.12.0
## [11] yaml_2.3.6 pillar_1.8.1
## [13] lattice_0.20-45 glue_1.6.2
## [15] digest_0.6.30 colorspace_2.0-3
## [17] htmltools_0.5.3 Matrix_1.5-3
## [19] XML_3.99-0.12 pkgconfig_2.0.3
## [21] zlibbioc_1.42.0 scales_1.2.1
## [23] BiocParallel_1.30.4 tibble_3.1.8
## [25] mgcv_1.8-41 generics_0.1.3
## [27] farver_2.1.1 cachem_1.0.6
## [29] withr_2.5.0 SummarizedExperiment_1.26.1
## [31] cli_3.4.1 magrittr_2.0.3
## [33] crayon_1.5.2 evaluate_0.18
## [35] fansi_1.0.3 nlme_3.1-160
## [37] textshaping_0.3.6 tools_4.2.2
## [39] BiocIO_1.6.0 lifecycle_1.0.3
## [41] matrixStats_0.63.0 stringr_1.4.1
## [43] munsell_0.5.0 DelayedArray_0.22.0
## [45] compiler_4.2.2 jquerylib_0.1.4
## [47] systemfonts_1.0.4 rlang_1.0.6
## [49] grid_4.2.2 RCurl_1.98-1.9
## [51] rjson_0.2.21 bitops_1.0-7
## [53] labeling_0.4.2 rmarkdown_2.18
## [55] restfulr_0.0.15 gtable_0.3.1
## [57] codetools_0.2-18 DBI_1.1.3
## [59] R6_2.5.1 GenomicAlignments_1.32.1
## [61] knitr_1.41 dplyr_1.0.10
## [63] fastmap_1.1.0 utf8_1.2.2
## [65] ragg_1.2.4 stringi_1.7.8
## [67] parallel_4.2.2 vctrs_0.5.1
## [69] tidyselect_1.2.0 xfun_0.35